{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这里基于电影推荐的数据集实现一下SVD算法， 看看效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T06:43:23.043847Z",
     "start_time": "2020-08-27T06:43:22.451965Z"
    }
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import math"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读入数据\n",
    "读取文件得到\"用户-电影\"的评分数据， 并且分为训练集和测试集， 这里的思想是首先给出数据存在的路径， 然后通过pandas读取数据， 然后遍历该数据集， 把相应的数据存放到字典中， 这里之所以会用字典， 是因为用户对电影的评分会存在大量的稀疏。 所以我们依然需要建立一个\"{用户：{电影: 评分}}\"的这样一个字典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T06:44:01.911620Z",
     "start_time": "2020-08-27T06:44:01.862757Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964983815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964982931</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating  timestamp\n",
       "0       1        1     4.0  964982703\n",
       "1       1        3     4.0  964981247\n",
       "2       1        6     4.0  964982224\n",
       "3       1       47     5.0  964983815\n",
       "4       1       50     5.0  964982931"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('ratings.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T06:44:31.161616Z",
     "start_time": "2020-08-27T06:44:30.986086Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Split trainingSet and testSet success!\n",
      "TrainSet = 75628\n",
      "TestSet = 25208\n"
     ]
    }
   ],
   "source": [
    "# 声明两个字典， 分别是训练集和测试集\n",
    "trainSet, testSet = {}, {}\n",
    "trainSet_len, testSet_len = 0, 0\n",
    "pivot = 0.75    # 训练集的比例\n",
    "\n",
    "# 遍历data的每一行， 把userId, movidId, rating按照{user: {movidId: rating}}的方式存储， 当然定义一个随机种子进行数据集划分\n",
    "for ele in data.itertuples():   # 遍历行这里推荐用itertuples， 比iterrows会高效很多\n",
    "    user, movie, rating = getattr(ele, 'userId'), getattr(ele, 'movieId'), getattr(ele, 'rating')\n",
    "    if random.random() < pivot:\n",
    "        trainSet.setdefault(user, {})\n",
    "        trainSet[user][movie] = rating\n",
    "        trainSet_len += 1\n",
    "    else:\n",
    "        testSet.setdefault(user, {})\n",
    "        testSet[user][movie] = rating \n",
    "        testSet_len += 1\n",
    "\n",
    "print('Split trainingSet and testSet success!')\n",
    "print('TrainSet = %s' % trainSet_len)\n",
    "print('TestSet = %s' % testSet_len)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T06:44:53.871862Z",
     "start_time": "2020-08-27T06:44:53.458462Z"
    }
   },
   "source": [
    "## 建立模型\n",
    "这里用基础的SVD试试， 也就是那个带正则和偏置的那个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T06:46:03.503590Z",
     "start_time": "2020-08-27T06:46:03.490583Z"
    }
   },
   "outputs": [],
   "source": [
    "class BasicSVD():\n",
    "    def __init__(self, rating_data, F=5, alpha=0.1, lmbda=0.1, max_iter=100):\n",
    "        self.F = F           # 这个表示隐向量的维度\n",
    "        self.P = dict()          #  用户矩阵P  大小是[users_num, F]\n",
    "        self.Q = dict()     # 物品矩阵Q  大小是[item_nums, F]\n",
    "        self.bu = dict()   # 用户偏差系数\n",
    "        self.bi = dict()    # 物品偏差系数\n",
    "        self.mu = 0.0        # 全局偏差系数\n",
    "        self.alpha = alpha   # 学习率\n",
    "        self.lmbda = lmbda    # 正则项系数\n",
    "        self.max_iter = max_iter    # 最大迭代次数\n",
    "        self.rating_data = rating_data # 评分矩阵\n",
    "        \n",
    "        # 初始化矩阵P和Q, 方法很多， 一般用随机数填充， 但随机数大小有讲究， 根据经验， 随机数需要和1/sqrt(F)成正比\n",
    "        cnt = 0    # 统计总的打分数， 初始化mu用\n",
    "        for user, items in self.rating_data.items():\n",
    "            self.P[user] = [random.random() / math.sqrt(self.F)  for x in range(0, F)]\n",
    "            self.bu[user] = 0\n",
    "            cnt += len(items) \n",
    "            for item, rating in items.items():\n",
    "                if item not in self.Q:\n",
    "                    self.Q[item] = [random.random() / math.sqrt(self.F) for x in range(0, F)]\n",
    "                    self.bi[item] = 0\n",
    "        self.mu /= cnt\n",
    "        \n",
    "    # 有了矩阵之后， 就可以进行训练, 这里使用随机梯度下降的方式训练参数P和Q\n",
    "    def train(self):\n",
    "        for step in range(self.max_iter):\n",
    "            for user, items in self.rating_data.items():\n",
    "                for item, rui in items.items():\n",
    "                    rhat_ui = self.predict(user, item)   # 得到预测评分\n",
    "                    # 计算误差\n",
    "                    e_ui = rui - rhat_ui\n",
    "                    \n",
    "                    self.bu[user] += self.alpha * (e_ui - self.lmbda * self.bu[user])\n",
    "                    self.bi[item] += self.alpha * (e_ui - self.lmbda * self.bi[item])\n",
    "                    # 随机梯度下降更新梯度\n",
    "                    for k in range(0, self.F):\n",
    "                        self.P[user][k] += self.alpha * (e_ui*self.Q[item][k] - self.lmbda * self.P[user][k])\n",
    "                        self.Q[item][k] += self.alpha * (e_ui*self.P[user][k] - self.lmbda * self.Q[item][k])\n",
    "                    \n",
    "            self.alpha *= 0.1    # 每次迭代步长要逐步缩小\n",
    "    \n",
    "    # 预测user对item的评分， 这里没有使用向量的形式\n",
    "    def predict(self, user, item):\n",
    "        return sum(self.P[user][f] * self.Q[item][f] for f in range(0, self.F)) + self.bu[user] + self.bi[item] + self.mu   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T07:05:00.972146Z",
     "start_time": "2020-08-27T06:48:27.130325Z"
    }
   },
   "outputs": [],
   "source": [
    "basicsvd = BasicSVD(trainSet, F=128)\n",
    "basicsvd.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 产生推荐列表， 并评估结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T07:35:45.529582Z",
     "start_time": "2020-08-27T07:35:44.160983Z"
    }
   },
   "outputs": [],
   "source": [
    "# 这里产生推荐列表， 遍历物品列表， 如果用户看了， 那么就跳过， 否则， 预测用户对该电影的打分， 然后记录， 最后排名\n",
    "movie_list = []\n",
    "for user, items in trainSet.items():\n",
    "    for item in items.keys():\n",
    "        if item not in movie_list:\n",
    "            movie_list.append(item)\n",
    "\n",
    "\n",
    "def recommend(aim_user, n=10):\n",
    "    rank = {}\n",
    "    watched_movies = trainSet[aim_user] # 目标用户看过的电影\n",
    "    \n",
    "    for movie in movie_list:\n",
    "        if movie in watched_movies:\n",
    "            continue\n",
    "        \n",
    "        # 如果当前用户没看过， 就预测打分， 并保存到rank\n",
    "        rank[movie] = basicsvd.predict(aim_user, movie)\n",
    "        \n",
    "    \n",
    "    return sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T07:35:50.419058Z",
     "start_time": "2020-08-27T07:35:50.129281Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(14, 6.912961297646142),\n",
       " (28, 6.896711491633388),\n",
       " (247, 6.627213206884909),\n",
       " (246, 6.443379973996613),\n",
       " (162, 6.370138897598569),\n",
       " (176, 6.359571028111279),\n",
       " (18, 6.307755110893343),\n",
       " (101, 6.306581248928255),\n",
       " (69, 6.19060919674639),\n",
       " (272, 6.180703380315263)]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 产生推荐列表\n",
    "recommend(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T07:43:35.488830Z",
     "start_time": "2020-08-27T07:40:31.723340Z"
    }
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'movie_count' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-21-0f4b4fb91ada>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[0mprecision\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhit\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;36m1.0\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mrec_count\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     35\u001b[0m \u001b[0mrecall\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhit\u001b[0m \u001b[1;33m/\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;36m1.0\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mtest_count\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mcoverage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mall_rec_movies\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mmovie_count\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     37\u001b[0m \u001b[0mret\u001b[0m \u001b[1;33m/=\u001b[0m \u001b[0mret_cou\u001b[0m\u001b[1;33m*\u001b[0m\u001b[1;36m1.0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'movie_count' is not defined"
     ]
    }
   ],
   "source": [
    "# 准确率、召回率和覆盖率\n",
    "n = 10     # 推荐用户评分高的前10部\n",
    "hit = 0\n",
    "rec_count = 0     # 统计推荐的影片数量， 计算查准率\n",
    "test_count = 0    # 统计测试集的影片数量， 计算查全率\n",
    "all_rec_movies = set()    # 统计被推荐出来的影片个数， 无重复了， 为了计算覆盖率\n",
    "item_populatity = dict()   # 计算新颖度\n",
    "\n",
    "# 先计算每部影片的流行程度\n",
    "for user, items in trainSet.items():\n",
    "    for item in items.keys():\n",
    "        if item not in item_populatity:\n",
    "            item_populatity[item] = 0\n",
    "        item_populatity[item] += 1    # 这里统计训练集中每部影片用户观看的总次数， 代表每部影片的流行程度\n",
    "\n",
    "\n",
    "# 计算评测指标\n",
    "ret = 0\n",
    "ret_cou = 0\n",
    "for user, items in trainSet.items():    # 这里得保证测试集里面的用户在训练集里面才能推荐\n",
    "    \n",
    "    test_movies = testSet.get(user, {})\n",
    "    rec_movies = recommend(user)\n",
    "    for movie, w in rec_movies:\n",
    "        if movie in test_movies:\n",
    "            hit += 1\n",
    "        all_rec_movies.add(movie)\n",
    "        ret += math.log(1+item_populatity[movie])\n",
    "        ret_cou += 1\n",
    "    rec_count += n\n",
    "    test_count += len(test_movies)\n",
    "    \n",
    "    \n",
    "precision = hit / (1.0 * rec_count)\n",
    "recall = hit / (1.0 * test_count)\n",
    "coverage = len(all_rec_movies) / len(item_populatity)\n",
    "ret /= ret_cou*1.0\n",
    "    \n",
    "print('precisioin = %.4f\\nrecall = %.4f\\ncoverage = %.4f\\npopularity = %.4f' % (precision, recall, coverage, ret))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-27T07:45:36.341811Z",
     "start_time": "2020-08-27T07:45:36.336824Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precisioin = 0.0072\n",
      "recall = 0.0017\n",
      "coverage = 0.0022\n",
      "popularity = 2.7131\n"
     ]
    }
   ],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
